References:


1. https://statsandr.com/blog/descriptive-statistics-in-r/
2. https://towardsdatascience.com/data-cleaning-with-r-and-the-tidyverse-detecting-missing-values-ea23c519bc62
3. https://rstudio-pubs-static.s3.amazonaws.com/3364_d1a578f521174152b46b19d0c83cbe7e.html
4. https://medium.com/data-science-in-your-pocket/various-data-distributions-in-statistics-362dc92558db
5. https://stackoverflow.com/questions/5570293/add-column-which-contains-binned-values-of-an-integer-column


Import Libraries

library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ dplyr   1.0.7
## ✓ tibble  3.1.5     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(summarytools)
## 
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
## 
##     view
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(dlookr)
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:base':
## 
##     transform

Data Cleaning

Read CSV

data_1990_1999 <- read.csv("../data/resale-flat-prices-based-on-approval-date-1990-1999.csv", quote = "", header = TRUE, sep = ",")
data_2000_2012 <- read.csv("../data/resale-flat-prices-based-on-approval-date-2000-feb-2012.csv", header = TRUE, sep = ",")
data_2012_2014 <- read.csv("../data/resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv", header = TRUE, sep = ",")
data_2015_2016 <- read.csv("../data/resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv", header = TRUE, sep = ",")
data_2017 <- read.csv("../data/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv", header = TRUE, sep = ",")

Data Columns Exploration

print("data_1990_1999:")
## [1] "data_1990_1999:"
str(data_1990_1999)
## 'data.frame':    287196 obs. of  10 variables:
##  $ month              : chr  "1990-01" "1990-01" "1990-01" "1990-01" ...
##  $ town               : chr  "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
##  $ flat_type          : chr  "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" ...
##  $ block              : chr  "309" "309" "309" "309" ...
##  $ street_name        : chr  "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
##  $ storey_range       : chr  "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" ...
##  $ floor_area_sqm     : num  31 31 31 31 73 67 67 67 67 67 ...
##  $ flat_model         : chr  "IMPROVED" "IMPROVED" "IMPROVED" "IMPROVED" ...
##  $ lease_commence_date: int  1977 1977 1977 1977 1976 1977 1977 1977 1977 1977 ...
##  $ resale_price       : int  9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
print("data_2000_2012:")
## [1] "data_2000_2012:"
str(data_2000_2012)
## 'data.frame':    369651 obs. of  10 variables:
##  $ month              : chr  "2000-01" "2000-01" "2000-01" "2000-01" ...
##  $ town               : chr  "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
##  $ flat_type          : chr  "3 ROOM" "3 ROOM" "3 ROOM" "3 ROOM" ...
##  $ block              : chr  "170" "174" "216" "215" ...
##  $ street_name        : chr  "ANG MO KIO AVE 4" "ANG MO KIO AVE 4" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
##  $ storey_range       : chr  "07 TO 09" "04 TO 06" "07 TO 09" "07 TO 09" ...
##  $ floor_area_sqm     : num  69 61 73 73 67 73 73 68 68 82 ...
##  $ flat_model         : chr  "Improved" "Improved" "New Generation" "New Generation" ...
##  $ lease_commence_date: int  1986 1986 1976 1976 1976 1977 1977 1981 1981 1981 ...
##  $ resale_price       : num  147000 144000 159000 167000 163000 157000 178000 160000 169000 205000 ...
print("data_2015_2016:")
## [1] "data_2015_2016:"
str(data_2015_2016)
## 'data.frame':    37153 obs. of  11 variables:
##  $ month              : chr  "2015-01" "2015-01" "2015-01" "2015-01" ...
##  $ town               : chr  "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
##  $ flat_type          : chr  "3 ROOM" "3 ROOM" "3 ROOM" "3 ROOM" ...
##  $ block              : chr  "174" "541" "163" "446" ...
##  $ street_name        : chr  "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" ...
##  $ storey_range       : chr  "07 TO 09" "01 TO 03" "01 TO 03" "01 TO 03" ...
##  $ floor_area_sqm     : num  60 68 69 68 68 67 68 68 67 68 ...
##  $ flat_model         : chr  "Improved" "New Generation" "New Generation" "New Generation" ...
##  $ lease_commence_date: int  1986 1981 1980 1979 1980 1980 1980 1981 1978 1985 ...
##  $ remaining_lease    : int  70 65 64 63 64 64 64 65 62 69 ...
##  $ resale_price       : num  255000 275000 285000 290000 290000 ...
print("data_2017:")
## [1] "data_2017:"
str(data_2017)
## 'data.frame':    115302 obs. of  11 variables:
##  $ month              : chr  "2017-01" "2017-01" "2017-01" "2017-01" ...
##  $ town               : chr  "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
##  $ flat_type          : chr  "2 ROOM" "3 ROOM" "3 ROOM" "3 ROOM" ...
##  $ block              : chr  "406" "108" "602" "465" ...
##  $ street_name        : chr  "ANG MO KIO AVE 10" "ANG MO KIO AVE 4" "ANG MO KIO AVE 5" "ANG MO KIO AVE 10" ...
##  $ storey_range       : chr  "10 TO 12" "01 TO 03" "01 TO 03" "04 TO 06" ...
##  $ floor_area_sqm     : num  44 67 67 68 67 68 68 67 68 67 ...
##  $ flat_model         : chr  "Improved" "New Generation" "New Generation" "New Generation" ...
##  $ lease_commence_date: int  1979 1978 1980 1980 1980 1981 1979 1976 1979 1979 ...
##  $ remaining_lease    : chr  "61 years 04 months" "60 years 07 months" "62 years 05 months" "62 years 01 month" ...
##  $ resale_price       : num  232000 250000 262000 265000 265000 275000 280000 285000 285000 285000 ...
print("data_2012_2014:")
## [1] "data_2012_2014:"
str(data_2012_2014)
## 'data.frame':    52203 obs. of  10 variables:
##  $ month              : chr  "2012-03" "2012-03" "2012-03" "2012-03" ...
##  $ town               : chr  "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
##  $ flat_type          : chr  "2 ROOM" "2 ROOM" "3 ROOM" "3 ROOM" ...
##  $ block              : chr  "172" "510" "610" "474" ...
##  $ street_name        : chr  "ANG MO KIO AVE 4" "ANG MO KIO AVE 8" "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" ...
##  $ storey_range       : chr  "06 TO 10" "01 TO 05" "06 TO 10" "01 TO 05" ...
##  $ floor_area_sqm     : num  45 44 68 67 67 68 67 67 67 67 ...
##  $ flat_model         : chr  "Improved" "Improved" "New Generation" "New Generation" ...
##  $ lease_commence_date: int  1986 1980 1980 1984 1980 1981 1978 1979 1979 1985 ...
##  $ resale_price       : num  250000 265000 315000 320000 321000 321000 323000 325000 328000 330000 ...

These datasets has 10 variables:
1. data_1990_1999
2. data_2000_2012
3. data_2012_2014

While the remaining datasets has 11 variables:
1. data_2015_2016
2. data_2017

The additional variable is remaining_lease, this variable only available in data from year 2015 onwards

Add new column “remaining_lease” for that 3 datasets and assign “NA”

data_1990_1999$remaining_lease <- NA
data_2000_2012$remaining_lease <- NA
data_2012_2014$remaining_lease <- NA

Data Integration

data <- rbind(data_1990_1999, data_2000_2012)
data <- rbind(data, data_2015_2016)
data <- rbind(data, data_2017)
data <- rbind(data, data_2012_2014)
str(data)
## 'data.frame':    861505 obs. of  11 variables:
##  $ month              : chr  "1990-01" "1990-01" "1990-01" "1990-01" ...
##  $ town               : chr  "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
##  $ flat_type          : chr  "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" ...
##  $ block              : chr  "309" "309" "309" "309" ...
##  $ street_name        : chr  "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
##  $ storey_range       : chr  "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" ...
##  $ floor_area_sqm     : num  31 31 31 31 73 67 67 67 67 67 ...
##  $ flat_model         : chr  "IMPROVED" "IMPROVED" "IMPROVED" "IMPROVED" ...
##  $ lease_commence_date: int  1977 1977 1977 1977 1976 1977 1977 1977 1977 1977 ...
##  $ resale_price       : num  9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
##  $ remaining_lease    : chr  NA NA NA NA ...

The merged dataset contains total 861,505 observations (flat resales transaction) and 11 variables.

Data Exploration To Detect Incorrect Data, Handling Incorrect Data, and Transforming Data

summary(data)
##     month               town            flat_type            block          
##  Length:861505      Length:861505      Length:861505      Length:861505     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  street_name        storey_range       floor_area_sqm    flat_model       
##  Length:861505      Length:861505      Min.   : 28.00   Length:861505     
##  Class :character   Class :character   1st Qu.: 73.00   Class :character  
##  Mode  :character   Mode  :character   Median : 93.00   Mode  :character  
##                                        Mean   : 95.67                     
##                                        3rd Qu.:113.00                     
##                                        Max.   :307.00                     
##  lease_commence_date  resale_price     remaining_lease   
##  Min.   :1966        Min.   :   5000   Length:861505     
##  1st Qu.:1980        1st Qu.: 185000   Class :character  
##  Median :1986        Median : 282000   Mode  :character  
##  Mean   :1988        Mean   : 302067                     
##  3rd Qu.:1995        3rd Qu.: 395000                     
##  Max.   :2019        Max.   :1360000
head(data)
##     month       town flat_type block      street_name storey_range
## 1 1990-01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 2 1990-01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     04 TO 06
## 3 1990-01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 4 1990-01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     07 TO 09
## 5 1990-01 ANG MO KIO    3 ROOM   216 ANG MO KIO AVE 1     04 TO 06
## 6 1990-01 ANG MO KIO    3 ROOM   211 ANG MO KIO AVE 3     01 TO 03
##   floor_area_sqm     flat_model lease_commence_date resale_price
## 1             31       IMPROVED                1977         9000
## 2             31       IMPROVED                1977         6000
## 3             31       IMPROVED                1977         8000
## 4             31       IMPROVED                1977         6000
## 5             73 NEW GENERATION                1976        47200
## 6             67 NEW GENERATION                1977        46000
##   remaining_lease
## 1            <NA>
## 2            <NA>
## 3            <NA>
## 4            <NA>
## 5            <NA>
## 6            <NA>
tail(data)
##          month   town flat_type block    street_name storey_range
## 861500 2014-12 YISHUN    5 ROOM   819   YISHUN ST 81     01 TO 03
## 861501 2014-12 YISHUN    5 ROOM   816   YISHUN ST 81     10 TO 12
## 861502 2014-12 YISHUN EXECUTIVE   325    YISHUN CTRL     10 TO 12
## 861503 2014-12 YISHUN EXECUTIVE   618 YISHUN RING RD     07 TO 09
## 861504 2014-12 YISHUN EXECUTIVE   277   YISHUN ST 22     07 TO 09
## 861505 2014-12 YISHUN EXECUTIVE   277   YISHUN ST 22     04 TO 06
##        floor_area_sqm flat_model lease_commence_date resale_price
## 861500            127   Improved                1987       500000
## 861501            122   Improved                1988       580000
## 861502            146 Maisonette                1988       540000
## 861503            164  Apartment                1992       738000
## 861504            152 Maisonette                1985       592000
## 861505            146 Maisonette                1985       545000
##        remaining_lease
## 861500            <NA>
## 861501            <NA>
## 861502            <NA>
## 861503            <NA>
## 861504            <NA>
## 861505            <NA>

1. month

head(data$month)
## [1] "1990-01" "1990-01" "1990-01" "1990-01" "1990-01" "1990-01"
tail(data$month)
## [1] "2014-12" "2014-12" "2014-12" "2014-12" "2014-12" "2014-12"
  • split into two columns - year and month
  • convert year and month from char to factor
data <- data %>% separate(month, c("year","month"), "-")
head(data)
##   year month       town flat_type block      street_name storey_range
## 1 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 2 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     04 TO 06
## 3 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 4 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     07 TO 09
## 5 1990    01 ANG MO KIO    3 ROOM   216 ANG MO KIO AVE 1     04 TO 06
## 6 1990    01 ANG MO KIO    3 ROOM   211 ANG MO KIO AVE 3     01 TO 03
##   floor_area_sqm     flat_model lease_commence_date resale_price
## 1             31       IMPROVED                1977         9000
## 2             31       IMPROVED                1977         6000
## 3             31       IMPROVED                1977         8000
## 4             31       IMPROVED                1977         6000
## 5             73 NEW GENERATION                1976        47200
## 6             67 NEW GENERATION                1977        46000
##   remaining_lease
## 1            <NA>
## 2            <NA>
## 3            <NA>
## 4            <NA>
## 5            <NA>
## 6            <NA>
str(data$year)
##  chr [1:861505] "1990" "1990" "1990" "1990" "1990" "1990" "1990" "1990" ...
str(data$month)
##  chr [1:861505] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" ...
unique(sort(data$year))
##  [1] "1990" "1991" "1992" "1993" "1994" "1995" "1996" "1997" "1998" "1999"
## [11] "2000" "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009"
## [21] "2010" "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "2019"
## [31] "2020" "2021"
unique(sort(data$month))
##  [1] "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12"
data %>% summarise(na_year = sum(is.na(year)),
                   na_month = sum(is.na(month)))
##   na_year na_month
## 1       0        0
data$int_year <- as.integer(data$year)

data$year <- as.factor(data$year)
data$month <- as.factor(data$month)
str(data$year)
##  Factor w/ 32 levels "1990","1991",..: 1 1 1 1 1 1 1 1 1 1 ...
str(data$month)
##  Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
sort(summary(data$year))
##  1990  1991  1992  2014  2013  2015  1993  2016  2017  2018  2019  2011  2012 
## 12505 12855 14503 16096 16097 17780 18116 19373 20509 21561 22186 22281 23198 
##  2020  1994  2007  2008  1995  2006  2021  2003  2004  2005  2009  1997  2010 
## 23333 26373 26982 27262 27289 27427 27713 29003 29112 30045 30482 31759 34854 
##  2000  1996  2002  2001  1998  1999 
## 34862 34919 36098 38055 51095 57782
sort(summary(data$month))
##    02    12    01    04    05    09    06    11    03    08    10    07 
## 60812 67944 68734 69956 70016 72831 73476 73689 73997 74962 77450 77638
  • there’s no missing value
  • exploration of year by plotting
barplot(table(data$year), ylim=c(0,60000), las = 2)

p <- ggplot(data, aes(x = fct_infreq(year), fill = fct_infreq(year))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("year")

p + coord_flip() + scale_y_continuous(limit = c(0, 60000))

barplot(table(data$month), ylim=c(0,80000), las = 2)

p <- ggplot(data, aes(x = fct_infreq(month), fill = fct_infreq(month))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("month")

p + coord_flip() + scale_y_continuous(limit = c(0, 80000))

  • uniform distribution type
  • the transaction records in every month are about the same, ranges from 60,000 to 80,000
  • July has the highest transaction records
  • February has the lowest transaction records

2. town

head(data$town)
## [1] "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO"
## [6] "ANG MO KIO"
tail(data$town)
## [1] "YISHUN" "YISHUN" "YISHUN" "YISHUN" "YISHUN" "YISHUN"
str(data$town)
##  chr [1:861505] "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
  • explore the number of distinct values of town
  • convert to factor
data %>% distinct(town)
##               town
## 1       ANG MO KIO
## 2            BEDOK
## 3           BISHAN
## 4      BUKIT BATOK
## 5      BUKIT MERAH
## 6      BUKIT TIMAH
## 7     CENTRAL AREA
## 8    CHOA CHU KANG
## 9         CLEMENTI
## 10         GEYLANG
## 11         HOUGANG
## 12     JURONG EAST
## 13     JURONG WEST
## 14 KALLANG/WHAMPOA
## 15   MARINE PARADE
## 16      QUEENSTOWN
## 17        SENGKANG
## 18       SERANGOON
## 19        TAMPINES
## 20       TOA PAYOH
## 21       WOODLANDS
## 22          YISHUN
## 23    LIM CHU KANG
## 24       SEMBAWANG
## 25   BUKIT PANJANG
## 26       PASIR RIS
## 27         PUNGGOL
data %>% summarise(na = sum(is.na(town)))
##   na
## 1  0
data$town <- as.factor(data$town)

str(data$town)
##  Factor w/ 27 levels "ANG MO KIO","BEDOK",..: 1 1 1 1 1 1 1 1 1 1 ...
sort(summary(data$town))
##    LIM CHU KANG     BUKIT TIMAH    CENTRAL AREA   MARINE PARADE       SEMBAWANG 
##              64            2377            6650            7640           11601 
##         PUNGGOL          BISHAN       SERANGOON     JURONG EAST KALLANG/WHAMPOA 
##           15605           20258           21765           23566           25075 
##   BUKIT PANJANG         GEYLANG        CLEMENTI      QUEENSTOWN        SENGKANG 
##           25568           26576           26607           27096           27234 
##       TOA PAYOH       PASIR RIS     BUKIT MERAH   CHOA CHU KANG     BUKIT BATOK 
##           29612           31426           31912           35159           41343 
##         HOUGANG      ANG MO KIO       WOODLANDS     JURONG WEST           BEDOK 
##           47248           49543           60667           62547           63328 
##          YISHUN        TAMPINES 
##           65336           75702
  • there’s no missing value
  • exploration by plotting
p <- ggplot(data, aes(x = fct_infreq(town), fill = fct_infreq(town))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("town")

p + coord_flip() + scale_y_continuous(limit = c(0, 80000))

# Zoom in to town with frequency < 10,000
data_filter <- data %>% 
  group_by(town) %>%
  filter(n() < 8000)

p <- ggplot(data_filter, aes(x = fct_infreq(town), fill = fct_infreq(town))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("town")

p + coord_flip() + scale_y_continuous(limit = c(0, 8000))

  • TAMPINES has the highest transaction records, more than 70,000 rows out of total 861,505 rows

  • LIM CHU KANG has the lowest transaction records, less than 2,500 rows

  • both LIM CHU KANG and BUKIT TIMAH have less than 2,500 transaction records, are these outliers?

  • classify towns into region (based on Wei Wen’s file)

data$region <- ifelse (data$town %in% c("BUKIT MERAH","BUKIT TIMAH","GEYLANG","TOA PAYOH",
                                        "BISHAN","KALLANG/WHAMPOA","MARINE PARADE"), "Central",
                       ifelse (data$town %in% c("TAMPINES","BEDOK","PASIR RIS"), "East",
                               ifelse (data$town %in% c("LIM CHU KANG","SEMBAWANG","WOODLANDS","YISHUN"), "North",
                                       ifelse (data$town %in% c("ANG MO KIO","HOUGANG","PUNGGOL","SENGKANG",
                                                                "SERANGOON"), "North-East",
                                               ifelse (data$town %in% c("BUKIT BATOK","BUKIT PANJANG",
                                                                        "CHOA CHU KANG","CLEMENTI",
                                                                        "JURONG EAST","JURONG WEST"), "West",
                                                       "Suburb")))))
                    
head(data)
##   year month       town flat_type block      street_name storey_range
## 1 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 2 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     04 TO 06
## 3 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 4 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     07 TO 09
## 5 1990    01 ANG MO KIO    3 ROOM   216 ANG MO KIO AVE 1     04 TO 06
## 6 1990    01 ANG MO KIO    3 ROOM   211 ANG MO KIO AVE 3     01 TO 03
##   floor_area_sqm     flat_model lease_commence_date resale_price
## 1             31       IMPROVED                1977         9000
## 2             31       IMPROVED                1977         6000
## 3             31       IMPROVED                1977         8000
## 4             31       IMPROVED                1977         6000
## 5             73 NEW GENERATION                1976        47200
## 6             67 NEW GENERATION                1977        46000
##   remaining_lease int_year     region
## 1            <NA>     1990 North-East
## 2            <NA>     1990 North-East
## 3            <NA>     1990 North-East
## 4            <NA>     1990 North-East
## 5            <NA>     1990 North-East
## 6            <NA>     1990 North-East
head(data$region)
## [1] "North-East" "North-East" "North-East" "North-East" "North-East"
## [6] "North-East"
tail(data$region)
## [1] "North" "North" "North" "North" "North" "North"
str(data$region)
##  chr [1:861505] "North-East" "North-East" "North-East" "North-East" ...
data$region <- as.factor(data$region)

str(data$region)
##  Factor w/ 6 levels "Central","East",..: 4 4 4 4 4 4 4 4 4 4 ...
sort(summary(data$region))
##     Suburb      North    Central North-East       East       West 
##      33746     137668     143450     161395     170456     214790
  • exploration by plotting
p <- ggplot(data, aes(x = fct_infreq(region), fill = fct_infreq(region))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("region")

p + coord_flip() + scale_y_continuous(limit = c(0, 250000))

  • WEST has the highest transaction records, more than 200,000 rows out of total 861,505 rows
  • Suburb has the lowest transaction records, less than 50,000 rows

3. flat_type

head(data$flat_type)
## [1] "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" "3 ROOM" "3 ROOM"
tail(data$flat_type)
## [1] "5 ROOM"    "5 ROOM"    "EXECUTIVE" "EXECUTIVE" "EXECUTIVE" "EXECUTIVE"
str(data$flat_type)
##  chr [1:861505] "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" "3 ROOM" "3 ROOM" ...
  • explore the number of distinct values of flat type
data %>% distinct(flat_type)
##          flat_type
## 1           1 ROOM
## 2           3 ROOM
## 3           4 ROOM
## 4           5 ROOM
## 5           2 ROOM
## 6        EXECUTIVE
## 7 MULTI GENERATION
## 8 MULTI-GENERATION
data %>% summarise(na = sum(is.na(flat_type)))
##   na
## 1  0
  • there’s no missing value
  • however, incorrect data spotted: MULTI GENERATION and MULTI-GENERATION
  • standardize to “MULTI-GENERATION”
  • convert to factor
data <- data %>%
  mutate(flat_type = replace(flat_type, flat_type ==  "MULTI GENERATION", "MULTI-GENERATION"))

data %>% distinct(flat_type)
##          flat_type
## 1           1 ROOM
## 2           3 ROOM
## 3           4 ROOM
## 4           5 ROOM
## 5           2 ROOM
## 6        EXECUTIVE
## 7 MULTI-GENERATION
data$flat_type <- as.factor(data$flat_type)

str(data$flat_type)
##  Factor w/ 7 levels "1 ROOM","2 ROOM",..: 1 1 1 1 3 3 3 3 3 3 ...
sort(summary(data$flat_type))
## MULTI-GENERATION           1 ROOM           2 ROOM        EXECUTIVE 
##              525             1284            10326            65436 
##           5 ROOM           3 ROOM           4 ROOM 
##           179795           280095           324044
  • exploration by plotting
p <- ggplot(data, aes(x = fct_infreq(flat_type), fill = fct_infreq(flat_type))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("flat_type")

p + coord_flip() + scale_y_continuous(limit = c(0, 350000))

# Zoom in to flat type with frequency < 5,000
data_filter <- data %>% 
  group_by(flat_type) %>%
  filter(n() < 5000)

p <- ggplot(data_filter, aes(x = fct_infreq(flat_type), fill = fct_infreq(flat_type))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("flat_type")

p + coord_flip() + scale_y_continuous(limit = c(0, 5000))

  • 4 ROOM type HDB has the highest transaction records, more than 300,000 rows out of total 861,505 rows
  • MULTI-GENERATION type HDB has the lowest transaction records, less than 1,000 rows
  • both MULTI-GENERATION and 1 ROOM type HDB have less than 1,500 transaction records, it is logical as not much HDB in these two types

4. block

head(data$block)
## [1] "309" "309" "309" "309" "216" "211"
tail(data$block)
## [1] "819" "816" "325" "618" "277" "277"
str(data$block)
##  chr [1:861505] "309" "309" "309" "309" "216" "211" "202" "235" "235" "232" ...
  • explore the number of distinct values of block
  • convert to factor
data %>% 
  distinct(block) %>%
  head()
##   block
## 1   309
## 2   216
## 3   211
## 4   202
## 5   235
## 6   232
data %>% 
  distinct(block) %>%
  tail()
##      block
## 2562  512B
## 2563  512A
## 2564  509C
## 2565  511A
## 2566  511B
## 2567  512C
data %>% summarise(na = sum(is.na(block)))
##   na
## 1  0
data$block <- as.factor(data$block)

str(data$block)
##  Factor w/ 2567 levels "1","10","100",..: 799 799 799 799 431 414 372 503 503 498 ...
sort(summary(data$block))
##     308     302      43     409     145     412     408     231     157      35 
##    1552    1555    1569    1575    1592    1599    1603    1613    1620    1622 
##     131     141     217     135     419      18     206     148     232     134 
##    1625    1632    1633    1651    1655    1658    1662    1671    1672    1680 
##     417     406     127     219     229     214      54     146      16      19 
##    1681    1690    1698    1699    1700    1705    1716    1719    1720    1727 
##     411     138     403     230      15     212     204     407     118     205 
##    1735    1737    1742    1759    1761    1764    1767    1771    1782    1784 
##      44     142     213     202      23      28      33     208      24     123 
##    1831    1844    1867    1872    1880    1895    1928    1936    1939    1954 
##      17      30     129      21      32     104     126     119      11     128 
##    1956    1960    1971    2051    2065    2106    2137    2139    2144    2169 
##     117     121      10       9     125     211      12     122     130      20 
##    2205    2222    2236    2266    2271    2274    2281    2300    2311    2327 
##     210      22      13      34     124     120     106     103     116     115 
##    2335    2352    2377    2448    2468    2471    2500    2676    2679    2704 
##     111     105       7     112       5     109     102       6     108     114 
##    2732    2755    2756    2762    2812    2844    2848    2883    2993    3047 
##       3     107     113       8       4     101     110       1       2 (Other) 
##    3057    3075    3149    3156    3192    3249    3263    3859    4423  648207
  • there’s no missing value
  • exploration by plotting
  • select blocks of top 10 highest transaction records and top 10 lowest transaction records
groupby_block <- data  %>% 
     group_by(block) %>%
     summarise(n = n()) %>%
     arrange(n, decreasing = TRUE)

btm10 <- head(groupby_block,10)
top10 <- tail(groupby_block,10)

# TOP 10 block with highest transactions
data_filter <- data %>% 
  filter(block %in% top10$block)

p <- ggplot(data_filter, aes(x = fct_infreq(block), fill = fct_infreq(block))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("block")

p + coord_flip() + scale_y_continuous(limit = c(0, 5000))

# BOTTOM 10 block with lowest transactions
data_filter <- data %>% 
  filter(block %in% btm10$block)

p <- ggplot(data_filter, aes(x = fct_infreq(block), fill = fct_infreq(block))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("block")

p + coord_flip() + scale_y_continuous(limit = c(0, 10))

5. street_name

head(data$street_name)
## [1] "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1"
## [5] "ANG MO KIO AVE 1" "ANG MO KIO AVE 3"
tail(data$street_name)
## [1] "YISHUN ST 81"   "YISHUN ST 81"   "YISHUN CTRL"    "YISHUN RING RD"
## [5] "YISHUN ST 22"   "YISHUN ST 22"
str(data$street_name)
##  chr [1:861505] "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
  • explore the number of distinct values of street_name
  • convert to factor
data %>% 
  distinct(street_name) %>%
  head()
##         street_name
## 1  ANG MO KIO AVE 1
## 2  ANG MO KIO AVE 3
## 3  ANG MO KIO AVE 4
## 4 ANG MO KIO AVE 10
## 5  ANG MO KIO AVE 5
## 6  ANG MO KIO AVE 8
data %>%
  distinct(street_name) %>%
  tail()
##        street_name
## 571 TAMPINES ST 86
## 572 KEAT HONG LINK
## 573 ALJUNIED AVE 2
## 574    SUMANG LANE
## 575  CANBERRA CRES
## 576    CANBERRA ST
data %>% summarise(na = sum(is.na(street_name)))
##   na
## 1  0
data$street_name <- as.factor(data$street_name)

str(data$street_name)
##  Factor w/ 576 levels "ADMIRALTY DR",..: 14 14 14 14 14 17 17 17 17 17 ...
sort(summary(data$street_name))
##      TAMPINES ST 33         GHIM MOH RD      BEDOK NTH ST 2      TAMPINES ST 71 
##                2699                2707                2721                2736 
##          EUNOS CRES      TAMPINES ST 82      TAMPINES ST 45   JURONG EAST ST 32 
##                2743                2783                2784                2821 
##          MARINE TER  POTONG PASIR AVE 1     SERANGOON AVE 4        YISHUN ST 71 
##                2823                2831                2840                2864 
##          PENDING RD      TAMPINES ST 12        BOON LAY AVE     WOODLANDS DR 50 
##                2868                2887                2895                2900 
##      BT BATOK ST 52   JURONG WEST ST 61      CLEMENTI AVE 5     WOODLANDS DR 14 
##                2904                2910                2938                2954 
##       YISHUN AVE 11       PUNGGOL FIELD      TAMPINES ST 11  CHOA CHU KANG CTRL 
##                2958                2962                2965                2980 
##        RIVERVALE DR        JLN BT MERAH   NEW UPP CHANGI RD CHOA CHU KANG AVE 3 
##                2981                2984                2986                3034 
##         JELAPANG RD  TELOK BLANGAH CRES     LOR 8 TOA PAYOH         C'WEALTH DR 
##                3076                3081                3144                3159 
##       HOUGANG AVE 3      BT BATOK ST 21   JURONG EAST ST 24  BT PANJANG RING RD 
##                3190                3212                3216                3226 
##            PETIR RD      BT BATOK ST 31      TAMPINES ST 81        YISHUN ST 22 
##                3233                3268                3290                3392 
##    WOODLANDS CIRCLE   JURONG WEST ST 65        YISHUN ST 72  CLEMENTI WEST ST 2 
##                3414                3472                3481                3492 
##        YISHUN AVE 5     PASIR RIS ST 71 SERANGOON NTH AVE 4       HOUGANG AVE 5 
##                3493                3496                3501                3574 
##      CLEMENTI AVE 2       TEBAN GDNS RD   JURONG WEST AVE 1   JURONG WEST ST 91 
##                3590                3595                3614                3623 
##     TANGLIN HALT RD       HOUGANG AVE 1 BT BATOK EAST AVE 5      TAMPINES ST 83 
##                3655                3664                3689                3709 
##  CHOA CHU KANG CRES   WOODLANDS RING RD         BOON LAY PL CHOA CHU KANG AVE 4 
##                3712                3759                3882                3949 
##      HOUGANG AVE 10     PASIR RIS ST 11        YISHUN AVE 6 SERANGOON NTH AVE 1 
##                3951                4000                4092                4104 
##      PASIR RIS DR 6        BISHAN ST 12   JURONG WEST ST 41      TAMPINES ST 41 
##                4173                4196                4215                4280 
##     PASIR RIS ST 21      TECK WHYE LANE     WOODLANDS ST 13        YISHUN ST 81 
##                4283                4308                4382                4396 
##           UBI AVE 1   JURONG WEST ST 52        BISHAN ST 13        YISHUN ST 61 
##                4458                4463                4506                4518 
##   JURONG WEST ST 81        YISHUN ST 21      CLEMENTI AVE 4        YISHUN ST 11 
##                4595                4690                4697                4710 
##      TAMPINES ST 22    ANG MO KIO AVE 1          CIRCUIT RD BT BATOK WEST AVE 6 
##                4819                4824                4942                4964 
##   JURONG EAST ST 21     LOR 1 TOA PAYOH          SIMEI ST 1    ANG MO KIO AVE 5 
##                5061                5077                5298                6158 
##   JURONG WEST ST 42        MARSILING DR    ANG MO KIO AVE 4        BEDOK NTH RD 
##                6203                6360                6936                7146 
##      BEDOK NTH ST 3      TAMPINES ST 21       HOUGANG AVE 8    ANG MO KIO AVE 3 
##                7239                7929                8877               11674 
##   ANG MO KIO AVE 10  BEDOK RESERVOIR RD      YISHUN RING RD             (Other) 
##               13227               14068               16597              435780
  • there’s no missing value
  • exploration by plotting
  • select street name of top 10 highest transaction records and top 10 lowest transaction records
groupby_street <- data  %>% 
     group_by(street_name) %>%
     summarise(n = n()) %>%
     arrange(n, decreasing = TRUE)

btm10 <- head(groupby_street,10)
top10 <- tail(groupby_street,10)

# TOP 10 street_name with highest transactions
data_filter <- data %>% 
  filter(street_name %in% top10$street_name)

p <- ggplot(data_filter, aes(x = fct_infreq(street_name), fill = fct_infreq(street_name))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("street_name")

p + coord_flip() + scale_y_continuous(limit = c(0, 20000))

# BOTTOM 10 street_name with lowest transactions
data_filter <- data %>% 
  filter(street_name %in% btm10$street_name)

p <- ggplot(data_filter, aes(x = fct_infreq(street_name), fill = fct_infreq(street_name))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("street_name")

p + coord_flip() + scale_y_continuous(limit = c(0, 25))

6. storey_range

head(data$storey_range)
## [1] "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" "04 TO 06" "01 TO 03"
tail(data$storey_range)
## [1] "01 TO 03" "10 TO 12" "10 TO 12" "07 TO 09" "07 TO 09" "04 TO 06"
str(data$storey_range)
##  chr [1:861505] "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" "04 TO 06" ...
  • explore the number of distinct values of storey_range
unique(sort(data$storey_range))
##  [1] "01 TO 03" "01 TO 05" "04 TO 06" "06 TO 10" "07 TO 09" "10 TO 12"
##  [7] "11 TO 15" "13 TO 15" "16 TO 18" "16 TO 20" "19 TO 21" "21 TO 25"
## [13] "22 TO 24" "25 TO 27" "26 TO 30" "28 TO 30" "31 TO 33" "31 TO 35"
## [19] "34 TO 36" "36 TO 40" "37 TO 39" "40 TO 42" "43 TO 45" "46 TO 48"
## [25] "49 TO 51"
data %>% summarise(na = sum(is.na(storey_range)))
##   na
## 1  0
  • there’s no missing value
  • split into two columns - storey_range_from and storey_range_to
  • add new column - storey_range_mean (mean of storey_range_from and storey_range_to)
  • convert new columns to int
  • convert storey_range to factor
data <- data %>% separate(storey_range, c("storey_range_from","storey_range_to"), " TO ", remove = FALSE)

unique(sort(data$storey_range_from))
##  [1] "01" "04" "06" "07" "10" "11" "13" "16" "19" "21" "22" "25" "26" "28" "31"
## [16] "34" "36" "37" "40" "43" "46" "49"
unique(sort(data$storey_range_to))
##  [1] "03" "05" "06" "09" "10" "12" "15" "18" "20" "21" "24" "25" "27" "30" "33"
## [16] "35" "36" "39" "40" "42" "45" "48" "51"
data$storey_range_from <- as.integer(data$storey_range_from)
data$storey_range_to <- as.integer(data$storey_range_to)

#data$storey_range_mean <- pmin(data$storey_range_from, data$storey_range_to)
data$storey_range_mean <- (data$storey_range_from + data$storey_range_to) / 2

head(data)
##   year month       town flat_type block      street_name storey_range
## 1 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 2 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     04 TO 06
## 3 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 4 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     07 TO 09
## 5 1990    01 ANG MO KIO    3 ROOM   216 ANG MO KIO AVE 1     04 TO 06
## 6 1990    01 ANG MO KIO    3 ROOM   211 ANG MO KIO AVE 3     01 TO 03
##   storey_range_from storey_range_to floor_area_sqm     flat_model
## 1                10              12             31       IMPROVED
## 2                 4               6             31       IMPROVED
## 3                10              12             31       IMPROVED
## 4                 7               9             31       IMPROVED
## 5                 4               6             73 NEW GENERATION
## 6                 1               3             67 NEW GENERATION
##   lease_commence_date resale_price remaining_lease int_year     region
## 1                1977         9000            <NA>     1990 North-East
## 2                1977         6000            <NA>     1990 North-East
## 3                1977         8000            <NA>     1990 North-East
## 4                1977         6000            <NA>     1990 North-East
## 5                1976        47200            <NA>     1990 North-East
## 6                1977        46000            <NA>     1990 North-East
##   storey_range_mean
## 1                11
## 2                 5
## 3                11
## 4                 8
## 5                 5
## 6                 2
unique(sort(data$storey_range_mean))
##  [1]  2  3  5  8 11 13 14 17 18 20 23 26 28 29 32 33 35 38 41 44 47 50
str(data$storey_range_from)
##  int [1:861505] 10 4 10 7 4 1 7 10 4 1 ...
str(data$storey_range_to)
##  int [1:861505] 12 6 12 9 6 3 9 12 6 3 ...
str(data$storey_range_mean)
##  num [1:861505] 11 5 11 8 5 2 8 11 5 2 ...
data$storey_range <- as.factor(data$storey_range)
sort(summary(data$storey_range))
## 31 TO 35 36 TO 40 49 TO 51 46 TO 48 26 TO 30 43 TO 45 21 TO 25 40 TO 42 
##        2        7       12       36       39       39       92      180 
## 16 TO 20 37 TO 39 34 TO 36 31 TO 33 28 TO 30 11 TO 15 06 TO 10 01 TO 05 
##      265      361      377      401     1241     1259     2474     2700 
## 25 TO 27 22 TO 24 19 TO 21 16 TO 18 13 TO 15 10 TO 12 01 TO 03 07 TO 09 
##     2847     6584    10131    21168    55716   166556   175050   196264 
## 04 TO 06 
##   217704
  • exploration by plotting
barplot(table(data$storey_range), ylim=c(0,250000), las = 2)

#barplot(table(data$storey_range_mean), ylim=c(0,250000), las = 2)

p <- ggplot(data, aes(x = fct_infreq(storey_range), fill = fct_infreq(storey_range))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("storey_range")

p + coord_flip() + scale_y_continuous(limit = c(0, 250000))

# Zoom in to storey_range with frequency < 5,000
data_filter <- data %>% 
  group_by(storey_range) %>%
  filter(n() < 3000)

p <- ggplot(data_filter, aes(x = fct_infreq(storey_range), fill = fct_infreq(storey_range))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("storey_range")

p + coord_flip() + scale_y_continuous(limit = c(0, 3000))

  • Storey range of 04 TO 06 has the highest transaction records, more than 200,000 rows out of total 861,505 rows
  • Storey range of 31 TO 35 has the lowest transaction records, less than 500 rows
  • Most of the HDB are <= 12 floors
  • There is a significant low transaction records for 01 TO 05 (mean = 3), 06 TO 10 (mean = 8), and 11 TO 15 (mean = 13), are these outliers?
  • The floor numbers are overlapping, how can we handle this? Use mean value?
barplot(table(data$storey_range_mean), ylim=c(0,250000), las = 2)

7. floor_area_sqm

head(data$floor_area_sqm)
## [1] 31 31 31 31 73 67
tail(data$floor_area_sqm)
## [1] 127 122 146 164 152 146
str(data$floor_area_sqm)
##  num [1:861505] 31 31 31 31 73 67 67 67 67 67 ...
data %>% summarise(na = sum(is.nan(floor_area_sqm)))
##   na
## 1  0
summary(data$floor_area_sqm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   28.00   73.00   93.00   95.67  113.00  307.00
  • there’s no missing value
  • exploration by plotting
barplot(table(data$floor_area_sqm), ylim=c(0,70000), las = 2)

data_plot <- data

data_plot$floor_area_sqm <- as.factor(data_plot$floor_area_sqm)
p <- ggplot(data_plot, aes(x = fct_infreq(floor_area_sqm), fill = fct_infreq(floor_area_sqm))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("floor_area_sqm")

p + coord_flip() + scale_y_continuous(limit = c(0, 350000))

# Zoom in to flat type with frequency < 5,000
data_filter <- data_plot %>% 
  group_by(floor_area_sqm) %>%
  filter(n() < 5000)

p <- ggplot(data_filter, aes(x = fct_infreq(floor_area_sqm), fill = fct_infreq(floor_area_sqm))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("floor_area_sqm")

p + coord_flip() + scale_y_continuous(limit = c(0, 5000))

  • binning
summary(data$floor_area_sqm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   28.00   73.00   93.00   95.67  113.00  307.00
max(data$floor_area_sqm) -  min(data$floor_area_sqm)
## [1] 279
lower_boundary <- c(seq(27, 287, by=20))
upper_boundary <- c(seq(47, 307, by=20))
lower_boundary
##  [1]  27  47  67  87 107 127 147 167 187 207 227 247 267 287
upper_boundary
##  [1]  47  67  87 107 127 147 167 187 207 227 247 267 287 307
bin_label <- c(paste(paste(lower_boundary, "-"),upper_boundary))
bin_label
##  [1] "27 - 47"   "47 - 67"   "67 - 87"   "87 - 107"  "107 - 127" "127 - 147"
##  [7] "147 - 167" "167 - 187" "187 - 207" "207 - 227" "227 - 247" "247 - 267"
## [13] "267 - 287" "287 - 307"
# range 279 divided into 14 bins - around 20 values in 1 bin
data$floor_area_sqm_bins <- cut(data$floor_area_sqm, breaks=c(seq(27, 307, by=20)), labels=bin_label)

data %>% select(floor_area_sqm, floor_area_sqm_bins) %>% head()
##   floor_area_sqm floor_area_sqm_bins
## 1             31             27 - 47
## 2             31             27 - 47
## 3             31             27 - 47
## 4             31             27 - 47
## 5             73             67 - 87
## 6             67             47 - 67
str(data$floor_area_sqm_bins)
##  Factor w/ 14 levels "27 - 47","47 - 67",..: 1 1 1 1 3 2 2 2 2 2 ...
summary(data$floor_area_sqm_bins)
##   27 - 47   47 - 67   67 - 87  87 - 107 107 - 127 127 - 147 147 - 167 167 - 187 
##      9539    153445    179352    255087    171907     67446     21821      2216 
## 187 - 207 207 - 227 227 - 247 247 - 267 267 - 287 287 - 307 
##       605        31        31        18         4         3
data %>% filter(is.na(floor_area_sqm_bins)) %>% select(floor_area_sqm, floor_area_sqm_bins) %>% head()
## [1] floor_area_sqm      floor_area_sqm_bins
## <0 rows> (or 0-length row.names)
barplot(table(data$floor_area_sqm_bins), ylim=c(0,300000), las = 2)

p <- ggplot(data, aes(x = fct_infreq(floor_area_sqm_bins), fill = fct_infreq(floor_area_sqm_bins))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("floor_area_sqm")

p + coord_flip() + scale_y_continuous(limit = c(0, 300000))

# Zoom in to flat type with frequency < 5,000
data_filter <- data %>% 
  group_by(floor_area_sqm_bins) %>%
  filter(n() < 50)

p <- ggplot(data_filter, aes(x = fct_infreq(floor_area_sqm_bins), fill = fct_infreq(floor_area_sqm_bins))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("floor_area_sqm")

p + coord_flip() + scale_y_continuous(limit = c(0, 50))

  • explore whether the min (28) and max (307) floor area in sqm are logical
  • Floor area (sqm) from 87-107 has the highest transaction records, more than 250,000 rows out of total 861,505 rows
  • Floor area (sqm) from 287-307 has the lowest transaction records, less than 5 rows
  • Most of the HDB are <= 12 floors
  • There are 5 bins have < 50 transaction records, are these outliers?

8. flat_model

head(data$flat_model)
## [1] "IMPROVED"       "IMPROVED"       "IMPROVED"       "IMPROVED"      
## [5] "NEW GENERATION" "NEW GENERATION"
tail(data$flat_model)
## [1] "Improved"   "Improved"   "Maisonette" "Apartment"  "Maisonette"
## [6] "Maisonette"
str(data$flat_model)
##  chr [1:861505] "IMPROVED" "IMPROVED" "IMPROVED" "IMPROVED" ...
  • explore the number of distinct values of flat_model
unique(sort(data$flat_model))
##  [1] "2-room"                 "2-ROOM"                 "Adjoined flat"         
##  [4] "Apartment"              "APARTMENT"              "DBSS"                  
##  [7] "Improved"               "IMPROVED"               "Improved-Maisonette"   
## [10] "IMPROVED-MAISONETTE"    "Maisonette"             "MAISONETTE"            
## [13] "Model A"                "MODEL A"                "Model A-Maisonette"    
## [16] "MODEL A-MAISONETTE"     "Model A2"               "Multi Generation"      
## [19] "MULTI GENERATION"       "New Generation"         "NEW GENERATION"        
## [22] "Premium Apartment"      "PREMIUM APARTMENT"      "Premium Apartment Loft"
## [25] "Premium Maisonette"     "Simplified"             "SIMPLIFIED"            
## [28] "Standard"               "STANDARD"               "Terrace"               
## [31] "TERRACE"                "Type S1"                "Type S2"
data %>% summarise(na = sum(is.na(flat_model)))
##   na
## 1  0
  • there’s no missing value
  • however spotted same flat_model in both lowercase and uppercase
  • update all to uppercase
  • convert to factor
data$flat_model <- toupper(data$flat_model)
unique(sort(data$flat_model))
##  [1] "2-ROOM"                 "ADJOINED FLAT"          "APARTMENT"             
##  [4] "DBSS"                   "IMPROVED"               "IMPROVED-MAISONETTE"   
##  [7] "MAISONETTE"             "MODEL A"                "MODEL A-MAISONETTE"    
## [10] "MODEL A2"               "MULTI GENERATION"       "NEW GENERATION"        
## [13] "PREMIUM APARTMENT"      "PREMIUM APARTMENT LOFT" "PREMIUM MAISONETTE"    
## [16] "SIMPLIFIED"             "STANDARD"               "TERRACE"               
## [19] "TYPE S1"                "TYPE S2"
data$flat_model <- as.factor(data$flat_model)

str(data$flat_model)
##  Factor w/ 20 levels "2-ROOM","ADJOINED FLAT",..: 5 5 5 5 12 12 12 12 12 12 ...
sort(summary(data$flat_model))
##                 2-ROOM PREMIUM APARTMENT LOFT     PREMIUM MAISONETTE 
##                     52                     77                     83 
##    IMPROVED-MAISONETTE                TYPE S2                TYPE S1 
##                    118                    175                    350 
##       MULTI GENERATION                TERRACE          ADJOINED FLAT 
##                    525                    668                   1151 
##     MODEL A-MAISONETTE                   DBSS               MODEL A2 
##                   1959                   2469                   9433 
##             MAISONETTE              APARTMENT      PREMIUM APARTMENT 
##                  28013                  33379                  39555 
##               STANDARD             SIMPLIFIED         NEW GENERATION 
##                  40863                  55122                 181433 
##               IMPROVED                MODEL A 
##                 226103                 239977
  • exploration by plotting
p <- ggplot(data, aes(x = fct_infreq(flat_model), fill = fct_infreq(flat_model))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("flat_model")

p + coord_flip() + scale_y_continuous(limit = c(0, 250000))

# Zoom in to flat_model with frequency < 5,000
data_filter <- data %>%
  group_by(flat_model) %>%
  filter(n() < 3000)

p <- ggplot(data_filter, aes(x = fct_infreq(flat_model), fill = fct_infreq(flat_model))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("flat_model")

p + coord_flip() + scale_y_continuous(limit = c(0, 3000))

  • regroup flat model (based on Wei Wen’s file)
data$new_flat_model <- ifelse (data$flat_model %in% c("2-ROOM","APARTMENT","ADJOINED FLAT","DBSS",
                                        "IMPROVED","MODEL A","MODEL A2"), "Flat",
                       ifelse (data$flat_model %in% c("MAISONETTE","IMPROVED-MAISONETTE",
                                                      "MODEL A-MAISONETTE"), "Maisonette",
                               ifelse (grepl("PREMIUM", data$flat_model), "Premium",
                                       ifelse (data$flat_model %in% c("TERRACE","TYPE S1","TYPE S2"), "Terrace",
                                                       "Others"))))
                    
head(data)
##   year month       town flat_type block      street_name storey_range
## 1 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 2 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     04 TO 06
## 3 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     10 TO 12
## 4 1990    01 ANG MO KIO    1 ROOM   309 ANG MO KIO AVE 1     07 TO 09
## 5 1990    01 ANG MO KIO    3 ROOM   216 ANG MO KIO AVE 1     04 TO 06
## 6 1990    01 ANG MO KIO    3 ROOM   211 ANG MO KIO AVE 3     01 TO 03
##   storey_range_from storey_range_to floor_area_sqm     flat_model
## 1                10              12             31       IMPROVED
## 2                 4               6             31       IMPROVED
## 3                10              12             31       IMPROVED
## 4                 7               9             31       IMPROVED
## 5                 4               6             73 NEW GENERATION
## 6                 1               3             67 NEW GENERATION
##   lease_commence_date resale_price remaining_lease int_year     region
## 1                1977         9000            <NA>     1990 North-East
## 2                1977         6000            <NA>     1990 North-East
## 3                1977         8000            <NA>     1990 North-East
## 4                1977         6000            <NA>     1990 North-East
## 5                1976        47200            <NA>     1990 North-East
## 6                1977        46000            <NA>     1990 North-East
##   storey_range_mean floor_area_sqm_bins new_flat_model
## 1                11             27 - 47           Flat
## 2                 5             27 - 47           Flat
## 3                11             27 - 47           Flat
## 4                 8             27 - 47           Flat
## 5                 5             67 - 87         Others
## 6                 2             47 - 67         Others
head(data$new_flat_model)
## [1] "Flat"   "Flat"   "Flat"   "Flat"   "Others" "Others"
tail(data$new_flat_model)
## [1] "Flat"       "Flat"       "Maisonette" "Flat"       "Maisonette"
## [6] "Maisonette"
str(data$new_flat_model)
##  chr [1:861505] "Flat" "Flat" "Flat" "Flat" "Others" "Others" "Others" ...
data$new_flat_model <- as.factor(data$new_flat_model)

str(data$new_flat_model)
##  Factor w/ 5 levels "Flat","Maisonette",..: 1 1 1 1 3 3 3 3 3 3 ...
sort(summary(data$new_flat_model))
##    Terrace Maisonette    Premium     Others       Flat 
##       1193      30090      39715     277943     512564
  • exploration by plotting
p <- ggplot(data, aes(x = fct_infreq(new_flat_model), fill = fct_infreq(new_flat_model))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("new_flat_model")

p + coord_flip() + scale_y_continuous(limit = c(0, 550000))

# Zoom in to new_flat_model with frequency < 5,000
data_filter <- data %>%
  group_by(new_flat_model) %>%
  filter(n() < 5000)

p <- ggplot(data_filter, aes(x = fct_infreq(new_flat_model), fill = fct_infreq(new_flat_model))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("new_flat_model")

p + coord_flip() + scale_y_continuous(limit = c(0, 5000))

  • Flat type HDB has the highest transaction records, more than 500,000 rows out of total 861,505 rows
  • Terrace type HDB has the lowest transaction records, less than 1,500 rows

9. lease_commence_date

head(data$lease_commence_date)
## [1] 1977 1977 1977 1977 1976 1977
tail(data$lease_commence_date)
## [1] 1987 1988 1988 1992 1985 1985
str(data$lease_commence_date)
##  int [1:861505] 1977 1977 1977 1977 1976 1977 1977 1977 1977 1977 ...
unique(sort(data$lease_commence_date))
##  [1] 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980
## [16] 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
## [31] 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
## [46] 2011 2012 2013 2014 2015 2016 2017 2018 2019
data %>% summarise(na = sum(is.na(lease_commence_date)))
##   na
## 1  0
summary(data$lease_commence_date)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1966    1980    1986    1988    1995    2019
# check whether remaining lease is calculated correctly
data %>%
  filter(!is.na(remaining_lease) && remaining_lease != (99 - (data$int_year - data$lease_commence_date))) %>% 
  select(remaining_lease)
## [1] remaining_lease
## <0 rows> (or 0-length row.names)
  • there’s no missing value

10. resale_price

head(data$resale_price)
## [1]  9000  6000  8000  6000 47200 46000
tail(data$resale_price)
## [1] 500000 580000 540000 738000 592000 545000
str(data$resale_price)
##  num [1:861505] 9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
data %>% summarise(na = sum(is.na(resale_price)))
##   na
## 1  0
summary(data$resale_price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5000  185000  282000  302067  395000 1360000
  • there’s no missing value
  • exploration by plotting
barplot(table(data$resale_price), ylim=c(0,8000), las = 2)

  • binning
summary(data$resale_price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5000  185000  282000  302067  395000 1360000
max(data$resale_price) -  min(data$resale_price)
## [1] 1355000
lower_boundary <- c(seq(4000, 1224400, by=135600))
upper_boundary <- c(seq(139600, 1360000, by=135600))
lower_boundary
##  [1]    4000  139600  275200  410800  546400  682000  817600  953200 1088800
## [10] 1224400
upper_boundary
##  [1]  139600  275200  410800  546400  682000  817600  953200 1088800 1224400
## [10] 1360000
bin_label <- c(paste(paste(lower_boundary, "-"),upper_boundary))
bin_label
##  [1] "4000 - 139600"     "139600 - 275200"   "275200 - 410800"  
##  [4] "410800 - 546400"   "546400 - 682000"   "682000 - 817600"  
##  [7] "817600 - 953200"   "953200 - 1088800"  "1088800 - 1224400"
## [10] "1224400 - 1360000"
data$resale_price_bins <- cut(data$resale_price, breaks=c(seq(4000, 1360000, by=135600)), labels=bin_label)
 
data %>% select(resale_price, resale_price_bins) %>% head()
##   resale_price resale_price_bins
## 1         9000     4000 - 139600
## 2         6000     4000 - 139600
## 3         8000     4000 - 139600
## 4         6000     4000 - 139600
## 5        47200     4000 - 139600
## 6        46000     4000 - 139600
str(data$resale_price_bins)
##  Factor w/ 10 levels "4000 - 139600",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(data$resale_price_bins)
##     4000 - 139600   139600 - 275200   275200 - 410800   410800 - 546400 
##            105819            311311            257074            129253 
##   546400 - 682000   682000 - 817600   817600 - 953200  953200 - 1088800 
##             39188             13228              4617               843 
## 1088800 - 1224400 1224400 - 1360000 
##               149                23
data %>% filter(is.na(resale_price_bins)) %>% select(resale_price, resale_price_bins) %>% head()
## [1] resale_price      resale_price_bins
## <0 rows> (or 0-length row.names)
  • exploration by plotting
barplot(table(data$resale_price_bins), ylim=c(0,350000), las = 2)

p <- ggplot(data, aes(x = fct_infreq(resale_price_bins), fill = fct_infreq(resale_price_bins))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("resale_price")

p + coord_flip() + scale_y_continuous(limit = c(0, 350000))

# Zoom in to resale_price with frequency < 5,000
data_filter <- data %>%
  group_by(resale_price_bins) %>%
  filter(n() < 5000)

p <- ggplot(data_filter, aes(x = fct_infreq(resale_price_bins), fill = fct_infreq(resale_price_bins))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("resale_price")

p + coord_flip() + scale_y_continuous(limit = c(0, 5000))

  • Resale price ranges from 139600-275200 has the highest transaction records, more than 300,000 rows out of total 861,505 rows

  • Resale price ranges from 1224400-1360000 has the lowest transaction records, less than 500 rows

  • any outliers?

boxplot(data$resale_price ~ data$year)

boxplot(data$resale_price ~ data$region)

boxplot(data$resale_price ~ data$floor_area_sqm_bins)

  • correlation between two variables
plot(data$region ~ data$resale_price)

library(ggplot2)
ggplot(data) +
  aes(x = resale_price, y = floor_area_sqm_bins, colour = region) +
  geom_point() +
  scale_color_hue()

11. remaining_lease

  • transform to month value
  • for NA, calculate the used lease using transaction year and lease commence date then subtract from total 99 lease year
head(data$remaining_lease)
## [1] NA NA NA NA NA NA
tail(data$remaining_lease)
## [1] NA NA NA NA NA NA
str(data$remaining_lease)
##  chr [1:861505] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA ...
data %>% summarise(na = sum(is.na(remaining_lease)))
##       na
## 1 709050
summary(data$remaining_lease)
##    Length     Class      Mode 
##    861505 character character
  • there’s 709050 missing data, calculate the remaining lease using transaction year and lease commence date
data_test <- data
data_test$used_lease <- data_test$int_year - data_test$lease_commence_date

data_test$remaining_lease <- as.integer(99 - data_test$used_lease)
str(data_test$remaining_lease)
##  int [1:861505] 86 86 86 86 85 86 86 86 86 86 ...
data_test %>% summarise(na = sum(is.na(remaining_lease)))
##   na
## 1  0
summary(data_test$remaining_lease)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   44.00   75.00   83.00   81.65   90.00  101.00
data_test %>% filter(remaining_lease > 99) %>% select (int_year, used_lease, lease_commence_date, remaining_lease)
##    int_year used_lease lease_commence_date remaining_lease
## 1      1991         -1                1992             100
## 2      1992         -1                1993             100
## 3      1993         -1                1994             100
## 4      1994         -1                1995             100
## 5      1994         -1                1995             100
## 6      1994         -2                1996             101
## 7      1994         -1                1995             100
## 8      1994         -1                1995             100
## 9      1994         -1                1995             100
## 10     1994         -1                1995             100
## 11     1994         -1                1995             100
## 12     1994         -1                1995             100
## 13     1994         -1                1995             100
## 14     1994         -1                1995             100
## 15     1994         -1                1995             100
## 16     1994         -1                1995             100
## 17     1994         -1                1995             100
## 18     1994         -1                1995             100
## 19     1994         -1                1995             100
## 20     1994         -1                1995             100
## 21     1994         -2                1996             101
## 22     1995         -1                1996             100
## 23     1995         -1                1996             100
## 24     1995         -1                1996             100
## 25     1995         -1                1996             100
## 26     1995         -1                1996             100
## 27     1995         -1                1996             100
## 28     1995         -1                1996             100
## 29     1995         -1                1996             100
## 30     1995         -1                1996             100
## 31     1995         -1                1996             100
## 32     1995         -1                1996             100
## 33     1995         -1                1996             100
## 34     1996         -1                1997             100
## 35     1996         -1                1997             100
## 36     1996         -1                1997             100
## 37     2004         -2                2006             101
## 38     2004         -2                2006             101
## 39     2005         -1                2006             100
## 40     2005         -1                2006             100
## 41     2005         -1                2006             100
## 42     2005         -1                2006             100
## 43     2005         -1                2006             100
## 44     2005         -1                2006             100
## 45     2005         -1                2006             100
## 46     2005         -1                2006             100
## 47     2005         -1                2006             100
## 48     2005         -1                2006             100
## 49     2005         -1                2006             100
## 50     2005         -1                2006             100
## 51     2011         -1                2012             100
data_test %>% summarise(count = sum(remaining_lease > 99))
##   count
## 1    51
# update remaining lease to the actual df
data$remaining_lease <- data_test$remaining_lease

# drop these 51 records
data <- subset(data,remaining_lease <= 99)

summary(data$remaining_lease)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   44.00   75.00   83.00   81.65   90.00   99.00
str(data$remaining_lease)
##  int [1:861454] 86 86 86 86 85 86 86 86 86 86 ...
  • exploration by plotting
barplot(table(data$remaining_lease), ylim=c(0,50000), las = 2)

data_plot <- data
data_plot$remaining_lease <- as.factor(data_plot$remaining_lease)
p <- ggplot(data_plot, aes(x = fct_infreq(remaining_lease), fill = fct_infreq(remaining_lease))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("remaining_lease")

p + coord_flip() + scale_y_continuous(limit = c(0, 50000))

# Zoom in to remaining_lease with frequency < 5,000
data_filter <- data_plot %>% 
  group_by(remaining_lease) %>%
  filter(n() < 1000)

p <- ggplot(data_filter, aes(x = fct_infreq(remaining_lease), fill = fct_infreq(remaining_lease))) +
  geom_bar(width = 1, colour = "black", show.legend = FALSE) +
  xlab("remaining_lease")

p + coord_flip() + scale_y_continuous(limit = c(0, 1000))

  • 94 and 95 remaining lease year has the highest transaction records, more than 45,000 rows out of total 861,505 rows
  • 44 remaining lease year has the lowest transaction records, less than 100 rows
  • remaining lease year > 95 has very low transaction records, should be due to the restriction to sell the HDB if purchased less than N years

Drop Columns

  • drop columns that are useless for our objective
data <- data[,-which(names(data) %in% c("lease_commence_date","int_year"))]

Explore Cleaned Dataset

summary(data)
##       year            month                 town       
##  1999   : 57782   07     : 77636   TAMPINES   : 75695  
##  1998   : 51095   10     : 77447   YISHUN     : 65336  
##  2001   : 38055   08     : 74958   BEDOK      : 63327  
##  2002   : 36098   03     : 73996   JURONG WEST: 62545  
##  1996   : 34916   11     : 73685   WOODLANDS  : 60661  
##  2000   : 34862   06     : 73470   ANG MO KIO : 49537  
##  (Other):608646   (Other):410262   (Other)    :484353  
##             flat_type          block                    street_name    
##  1 ROOM          :  1284   2      :  4423   YISHUN RING RD    : 16597  
##  2 ROOM          : 10326   1      :  3859   BEDOK RESERVOIR RD: 14068  
##  3 ROOM          :280095   110    :  3263   ANG MO KIO AVE 10 : 13227  
##  4 ROOM          :324030   101    :  3249   ANG MO KIO AVE 3  : 11674  
##  5 ROOM          :179771   4      :  3192   HOUGANG AVE 8     :  8877  
##  EXECUTIVE       : 65423   8      :  3156   TAMPINES ST 21    :  7929  
##  MULTI-GENERATION:   525   (Other):840312   (Other)           :789082  
##    storey_range    storey_range_from storey_range_to  floor_area_sqm  
##  04 TO 06:217692   Min.   : 1.000    Min.   : 3.000   Min.   : 28.00  
##  07 TO 09:196254   1st Qu.: 4.000    1st Qu.: 6.000   1st Qu.: 73.00  
##  01 TO 03:175041   Median : 7.000    Median : 9.000   Median : 93.00  
##  10 TO 12:166548   Mean   : 6.593    Mean   : 8.609   Mean   : 95.67  
##  13 TO 15: 55714   3rd Qu.:10.000    3rd Qu.:12.000   3rd Qu.:113.00  
##  16 TO 18: 21167   Max.   :49.000    Max.   :51.000   Max.   :307.00  
##  (Other) : 29038                                                      
##              flat_model      resale_price     remaining_lease
##  MODEL A          :239958   Min.   :   5000   Min.   :44.00  
##  IMPROVED         :226084   1st Qu.: 185000   1st Qu.:75.00  
##  NEW GENERATION   :181433   Median : 282000   Median :83.00  
##  SIMPLIFIED       : 55122   Mean   : 302064   Mean   :81.65  
##  STANDARD         : 40863   3rd Qu.: 395000   3rd Qu.:90.00  
##  PREMIUM APARTMENT: 39555   Max.   :1360000   Max.   :99.00  
##  (Other)          : 78439                                    
##         region       storey_range_mean floor_area_sqm_bins    new_flat_model  
##  Central   :143439   Min.   : 2.000    87 - 107 :255075    Flat      :512514  
##  East      :170437   1st Qu.: 5.000    67 - 87  :179352    Maisonette: 30089  
##  North     :137662   Median : 8.000    107 - 127:171890    Others    :277943  
##  North-East:161389   Mean   : 7.601    47 - 67  :153445    Premium   : 39715  
##  Suburb    : 33743   3rd Qu.:11.000    127 - 147: 67431    Terrace   :  1193  
##  West      :214784   Max.   :50.000    147 - 167: 21816                       
##                                        (Other)  : 12445                       
##        resale_price_bins 
##  139600 - 275200:311302  
##  275200 - 410800:257045  
##  410800 - 546400:129244  
##  4000 - 139600  :105818  
##  546400 - 682000: 39185  
##  682000 - 817600: 13228  
##  (Other)        :  5632
str(data)
## 'data.frame':    861454 obs. of  18 variables:
##  $ year               : Factor w/ 32 levels "1990","1991",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ month              : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ town               : Factor w/ 27 levels "ANG MO KIO","BEDOK",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ flat_type          : Factor w/ 7 levels "1 ROOM","2 ROOM",..: 1 1 1 1 3 3 3 3 3 3 ...
##  $ block              : Factor w/ 2567 levels "1","10","100",..: 799 799 799 799 431 414 372 503 503 498 ...
##  $ street_name        : Factor w/ 576 levels "ADMIRALTY DR",..: 14 14 14 14 14 17 17 17 17 17 ...
##  $ storey_range       : Factor w/ 25 levels "01 TO 03","01 TO 05",..: 6 3 6 5 3 1 5 6 3 1 ...
##  $ storey_range_from  : int  10 4 10 7 4 1 7 10 4 1 ...
##  $ storey_range_to    : int  12 6 12 9 6 3 9 12 6 3 ...
##  $ floor_area_sqm     : num  31 31 31 31 73 67 67 67 67 67 ...
##  $ flat_model         : Factor w/ 20 levels "2-ROOM","ADJOINED FLAT",..: 5 5 5 5 12 12 12 12 12 12 ...
##  $ resale_price       : num  9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
##  $ remaining_lease    : int  86 86 86 86 85 86 86 86 86 86 ...
##  $ region             : Factor w/ 6 levels "Central","East",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ storey_range_mean  : num  11 5 11 8 5 2 8 11 5 2 ...
##  $ floor_area_sqm_bins: Factor w/ 14 levels "27 - 47","47 - 67",..: 1 1 1 1 3 2 2 2 2 2 ...
##  $ new_flat_model     : Factor w/ 5 levels "Flat","Maisonette",..: 1 1 1 1 3 3 3 3 3 3 ...
##  $ resale_price_bins  : Factor w/ 10 levels "4000 - 139600",..: 1 1 1 1 1 1 1 1 1 1 ...

Export Cleaned Dataset

write.csv(data,"../data/data_clean.csv", row.names = FALSE)